Abstract
MSDS6306: Doing Data Science - Case Study 01: Beer & Brewery. I cleaned the data and performed EDA to visualize distribution (log transformed when necessary) as well as summary statistics. I conducted various hypothesis tests, k-NN cross validation on normalized values, linear regression model, and correlation tests to derive r, its p-value, r squared. Lastly, I examined the geographical distribution of IBU in order to recommend appropriate beer attributes that cater to the unmet needs of the regional markets.Load the required packages
library(tidyverse)
library(cowplot)
library(maps)
library(ggmap)
library(viridis)
library(plotly)
library(caret) #confusion matrix
library(e1071)
library(class) #knn.cv
Import the data
breweries = read_csv('https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Breweries.csv')
beer = read_csv('https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Beers.csv')
Breweries data has 558 observations and four columns:
dim(breweries)
## [1] 558 4
spec(breweries)
## cols(
## Brew_ID = col_double(),
## Name = col_character(),
## City = col_character(),
## State = col_character()
## )
Beer data has 2410 observations and seven columns:
dim(beer)
## [1] 2410 7
spec(beer)
## cols(
## Name = col_character(),
## Beer_ID = col_double(),
## ABV = col_double(),
## IBU = col_double(),
## Brewery_id = col_double(),
## Style = col_character(),
## Ounces = col_double()
## )
Merge datasets by primary key
df = merge(breweries, beer, by.x = 'Brew_ID', by.y = 'Brewery_id')
head(df)
Rename Name.x and Name.y
df = rename(df, Brewery = Name.x, Beer = Name.y)
head(df)
Clean the data:
cbind(lapply(lapply(df, is.na), sum))
## [,1]
## Brew_ID 0
## Brewery 0
## City 0
## State 0
## Beer 0
## Beer_ID 0
## ABV 62
## IBU 1005
## Style 5
## Ounces 0
final = df %>% na.exclude()
sum(as.numeric(is.na.data.frame(final)))
## [1] 0
Final dataset has 1403 observations and 10 columns:
1. Brew_ID: Unique identifier of the brewery
2. Brewery: Name of the brewery
3. City: City where brewery is located
4. State: U.S. state where brewery is located
5. Beer: Name of the beer
6. Beer_ID: Unique identifier of the beer
7. ABV: Alcohol by volume of the beer
8. IBU: International Bitterness Units of the beer
9. Ounces: Ounces of the beer
10. Style: Style of the beer
dim(final)
## [1] 1403 10
str(final)
## 'data.frame': 1403 obs. of 10 variables:
## $ Brew_ID: num 1 1 1 1 1 1 2 2 2 2 ...
## $ Brewery: chr "NorthGate Brewing" "NorthGate Brewing" "NorthGate Brewing" "NorthGate Brewing" ...
## $ City : chr "Minneapolis" "Minneapolis" "Minneapolis" "Minneapolis" ...
## $ State : chr "MN" "MN" "MN" "MN" ...
## $ Beer : chr "Pumpion" "Stronghold" "Parapet ESB" "Get Together" ...
## $ Beer_ID: num 2689 2688 2687 2692 2691 ...
## $ ABV : num 0.06 0.06 0.056 0.045 0.049 0.048 0.042 0.08 0.125 0.077 ...
## $ IBU : num 38 25 47 50 26 19 42 68 80 25 ...
## $ Style : chr "Pumpkin Ale" "American Porter" "Extra Special / Strong Bitter (ESB)" "American IPA" ...
## $ Ounces : num 16 16 16 16 16 16 16 16 16 16 ...
## - attr(*, "na.action")= 'exclude' Named int [1:1007] 19 35 36 37 38 39 40 41 42 43 ...
## ..- attr(*, "names")= chr [1:1007] "19" "35" "36" "37" ...
The first six and last rows of the dataframe
head(final)
tail(final)
How many breweries are in each state?
breweries %>% group_by(State) %>% tally(sort=T)
Plot into a heatmap
states = map_data('state')
st_abb = data.frame(abb = state.abb, region = tolower(state.name))
brew = breweries %>% group_by(State) %>% tally(sort=T)
brew = subset(brew, State != 'HI' & State != 'AK')
brew = rename(brew, abb = State, count = n)
brew = merge(brew, st_abb, by='abb')
geo = merge(states, brew, by='region', all.x=T)
geo = geo[order(geo$order),]
center <- data.frame(region=tolower(state.name), long=state.center$x, lat=state.center$y)
center <- merge(center, brew, by="region", all.x = TRUE)
ggplot(geo, aes(x=long,y=lat)) +
geom_polygon(aes(group=group, fill=count)) +
geom_text(data=center, aes(long, lat, label=count)) +
scale_fill_gradient(low = "slategray1",
high = "royalblue4",
guide = "colorbar") +
ggtitle("Breweries per State") +
coord_map() + theme(axis.title = element_text(size = 20)) + theme_void() + theme(legend.position = c(0.9, 0.4), plot.title = element_text(size= 20, hjust=0.1, margin = margin(b = -0.1, t = 0.4, l = 2, unit = "cm")))
Compute median ABV for each state
median = final %>% select(State, ABV, IBU) %>% group_by(State) %>% summarize(med_abv = median(ABV), med_ibu = median(IBU))
arrange(median, -med_abv)
arrange(median, -med_ibu)
median %>% mutate(perc = med_abv*100, State = fct_reorder(State, perc)) %>% ggplot(aes(State, perc)) + geom_point(size=4, color="royalblue2") + coord_flip() + labs(y='ABV (%)', title='Median Alcohol by Volume of each State', subtitle = 'Decreasing Order') + theme_minimal() + theme(plot.title = element_text(size= 20), axis.title = element_text(size = 20))
Compute median IBU for each state
arrange(median, -med_ibu)
# no geom_segment
median %>% mutate(State = fct_reorder(State, med_ibu)) %>% ggplot(aes(State, med_ibu)) + geom_point(size=4, color="indianred2") + coord_flip() + labs(y='IBU', title='Median International Bitterness Unit of each State', subtitle = 'Decreasing Order') + theme_minimal() + theme(plot.title = element_text(size= 20), axis.title = element_text(size = 20))
Kentucky has the max ABV
max(final$ABV)
## [1] 0.125
grep(.125, final$ABV)
## [1] 9
final$State[9]
## [1] "KY"
Oregon has the highest IBU
max(final$IBU)
## [1] 138
grep(138, final$IBU)
## [1] 1132
final$State[1132]
## [1] "OR"
Summary statistics and distribution of ABV
| Statistic | Percent |
|---|---|
| Min | 2.7 |
| Med | 5.7 |
| Mean | 6 |
| Max | 12.5 |
| IQR | 1.8 |
Right skew distribution suggests lower production at increasing ABV. Consumer preference (and thus sales) drives demand to infer most lean towards lower concentration of ABV. The median value of 5.7% is a better indicator of the distribution because the mean is pulled towards the skewed tail.
summary(final$ABV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02700 0.05000 0.05700 0.05992 0.06800 0.12500
fivenum(final$ABV)
## [1] 0.027 0.050 0.057 0.068 0.125
IQR(final$ABV)
## [1] 0.018
final %>% mutate(abvperc = ABV*100) %>% ggplot(aes(abvperc)) + geom_boxplot(fill='royalblue', alpha=.3) + labs(x='ABV (%)', title='Distribution of Alcohol by Volume') + theme_cowplot() + theme(plot.title = element_text(size= 20), axis.ticks.y=element_blank(), axis.title.y=element_blank(), axis.text.y=element_blank(), legend.position = 'none', axis.title = element_text(size = 20)) + geom_segment(aes(x = 6, y = -.05, xend = 6, yend = .05, col='red'))
# density plot
final %>% mutate(abvperc = ABV*100) %>% ggplot(aes(abvperc)) + geom_density(fill='indianred2', color='indianred3', alpha=.9) + geom_vline(xintercept=5.7, col='cornflowerblue') + geom_vline(xintercept = 6, linetype='dotted', col='paleturquoise3', size=1.1) + labs(x='ABV (%)', title='Distribution of Alcohol by Volume') + theme_minimal_hgrid() + theme(axis.title = element_text(size = 20), plot.title = element_text(size= 20))
Relationship between ABV and IBU using polynomial regression model
The ABV and IBU values were log transformed. The linear regression line is depicted in green whereas polynomial regression line is in red. The latter model better fits the data however both are pretty similar.
final2 = final %>% mutate(log_abv = log(ABV), log_ibu = log(IBU))
x = final2$log_abv
y = final2$log_ibu
model3 = lm(y~x)
new_x = cbind(x, x^2)
model4 = lm(y~new_x)
ggplot(data = final2) + geom_point(aes(x = log_abv,y = log_ibu)) +
geom_line(aes(x = log_abv,y = model3$fit), color = "blue", lwd=.8) +
geom_line(aes(x = log_abv,y = model4$fit), color = "red", lwd=.8) +
labs(title='Relationship between Bitterness of Beer and its Alcohol Content', subtitle = 'Linear & Polynomial Regression Model on Log Transformed Data', x= 'log(ABV)', y='log(IBU)') + theme_classic()
new calculation
final = final %>% mutate(log_abv = log(ABV))
cor.test(x=final$log_abv, y=final$IBU)
##
## Pearson's product-moment correlation
##
## data: final$log_abv and final$IBU
## t = 34.032, df = 1401, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6430317 0.7004025
## sample estimates:
## cor
## 0.6727271
model = lm(IBU~log_abv, data=final)
summary(model)
##
## Call:
## lm(formula = IBU ~ log_abv, data = final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -74.393 -12.410 -0.622 12.989 91.579
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 272.589 6.773 40.24 <2e-16 ***
## log_abv 80.972 2.379 34.03 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.22 on 1401 degrees of freedom
## Multiple R-squared: 0.4526, Adjusted R-squared: 0.4522
## F-statistic: 1158 on 1 and 1401 DF, p-value: < 2.2e-16
new ibu abv plot on logged abv
final %>% ggplot(aes(x=log_abv, y=IBU)) + geom_point(color="black", fill="#69b3a2", shape=22, alpha=0.5, size=3, stroke = 1) + geom_line(aes(x = log_abv,y = model$fit), color = "lightpink3", lwd=.8) + labs(title='Relationship between Bitterness of Beer and its Alcohol Content', subtitle = 'Linear Regression Model on Log Transformed ABV', x= 'ABV (logged)', y='IBU') + theme_classic() + theme(axis.title = element_text(size = 20), plot.title = element_text(size= 20))
ABV vs. IBU by Style
View all 90 different styles
final %>% group_by(Style) %>% tally(sort=T)
Style df of just IPA/ale styles. 32 different styles of IPA and ale encompassing 944 observations and 10 columns
style = final %>% group_by(Style) %>% filter(grepl('\\sAle|IPA$', Style))
dim(style)
## [1] 944 11
Distinguish ale from ipa
all_style = final %>% group_by(Style)
keep = grep('\\sAle', all_style$Style)
ale = all_style[keep,]
rem = grep('India', ale$Style)
ale = ale[-rem,] #552 obs, 27 ale styles
ale$Style = 'ale'
ipa = grep(('\\sIndia\\sPale\\sAle|IPA$'), all_style$Style)
ipa = all_style[ipa,] # 392 obs, 5 ipa styles
ipa$Style = 'ipa'
ipa_ale = rbind(ipa, ale)
ipa_ale$Style = as.factor(ipa_ale$Style)
Code the plots
pmain = ipa_ale %>% ggplot(aes(x=ABV, y=IBU, shape=factor(Style))) + geom_point(aes(color=factor(Style))) + theme(legend.position = 'none', axis.title = element_text(size = 20), panel.background = element_rect(fill = "gray97", color = NA))
xdens = axis_canvas(pmain, axis='x') + geom_density(data=ipa_ale, aes(ABV, fill=Style), alpha=.7, size=.2)
ydens = axis_canvas(pmain, axis='y', coord_flip = TRUE) + geom_density(data=ipa_ale, aes(IBU, fill=Style), alpha=.7, size=.2) + coord_flip()
p1 = insert_xaxis_grob(pmain, xdens, grid::unit(.2, 'null'), position='top')
p2 = insert_yaxis_grob(p1, ydens, grid::unit(.2, 'null'), position='right')
Plot scatterplot with marginal plot on the axis
ggdraw(p2) + draw_label('ABV vs. IBU by Style', x=.12, y=.98, size=20) + draw_label('IPA', x=.52, y=.95, size=13, color='paleturquoise4', fontface = 'bold') + draw_label('Ale', x=.35, y=.97, size=13, color='rosybrown', fontface = 'bold')
Conduct KNN classifier to investigate the difference with respect to ABV and IBU between IPAs and other types of Ale
normalize = function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
ipa_ale = ipa_ale %>% mutate(norm_abv = normalize(ABV), norm_ibu = normalize(IBU))
model = knn.cv(ipa_ale[,c(12:13)], ipa_ale$Style, k=11)
confusionMatrix(model, ipa_ale$Style)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ale ipa
## ale 460 117
## ipa 92 275
##
## Accuracy : 0.7786
## 95% CI : (0.7507, 0.8047)
## No Information Rate : 0.5847
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.5399
##
## Mcnemar's Test P-Value : 0.09689
##
## Sensitivity : 0.8333
## Specificity : 0.7015
## Pos Pred Value : 0.7972
## Neg Pred Value : 0.7493
## Prevalence : 0.5847
## Detection Rate : 0.4873
## Detection Prevalence : 0.6112
## Balanced Accuracy : 0.7674
##
## 'Positive' Class : ale
##
Conduct t.test
ipa_ale = ipa_ale %>% mutate(logabv=log(ABV), logibu=log(IBU))
t.test(logabv~Style, data=ipa_ale)
##
## Welch Two Sample t-test
##
## data: logabv by Style
## t = -16.91, df = 849.95, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group ale and group ipa is not equal to 0
## 95 percent confidence interval:
## -0.2260548 -0.1790364
## sample estimates:
## mean in group ale mean in group ipa
## -2.889941 -2.687395
t.test(logibu~Style, data=ipa_ale)
##
## Welch Two Sample t-test
##
## data: logibu by Style
## t = -30.993, df = 875.42, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group ale and group ipa is not equal to 0
## 95 percent confidence interval:
## -0.8911237 -0.7849819
## sample estimates:
## mean in group ale mean in group ipa
## 3.399569 4.237622
IBU distribution on US map (static plot)
states = map_data('state')
usabeer = final %>% select(City, State, ABV, IBU)
usabeer = subset(usabeer, State != 'HI' & State != 'AK')
usabeer$citystate = str_c(usabeer$City, ", ", usabeer$State)
usabeer = cbind(geocode(as.character(usabeer$citystate)), usabeer)
usabeer = usabeer %>% arrange(IBU) %>% mutate(cs=factor(citystate, unique(citystate)))
ggplot() + geom_polygon(data=states, aes(x=long, y=lat, group=group), fill='grey', alpha=.3) + geom_point(data=usabeer, aes(lon, y=lat, size=IBU, color=IBU, alpha=IBU), shape=20) + scale_size_continuous(name='IBU', range=c(1,10), breaks=c(1, 25, 50, 75, 100)) + scale_alpha_continuous(name='IBU', range=c(.1, .9), breaks=c(1, 25, 50, 75, 100)) + scale_color_viridis(option='magma', direction=-1, breaks=c(1, 25, 50, 75, 100), name='IBU') + theme_void() + coord_map() + borders('state') + guides( colour = guide_legend()) +
ggtitle("Distribution of International Bitterness Unit by Cities") +
theme(
legend.position = c(0.85, 0.25),
text = element_text(color = "#22211d"),
plot.background = element_rect(fill = "#f5f5f2", color = NA),
panel.background = element_rect(fill = "#f5f5f2", color = NA),
legend.background = element_rect(fill = "#f5f5f2", color = NA),
plot.title = element_text(size= 20, hjust=0.1, margin = margin(b = -.5, t = 0.4, l = 2, unit = "cm")),
)
Plot IBU distribution
summary(final$IBU)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.00 21.00 35.00 42.74 64.00 138.00
usabeer %>% ggplot(aes(IBU)) + geom_density(fill='royalblue', alpha=.7) + geom_vline(xintercept=35, col='red') + geom_vline(xintercept = 42.74, linetype='dotted', col='turquoise', size=1.1) + labs(title='International Bitterness Unit') + theme_minimal_hgrid() + theme(axis.title = element_text(size = 20), plot.title = element_text(size= 20))
IBU distribution on US map (PLOTLY)
usabeer = usabeer %>% mutate( mytext=paste(citystate, "\n", "IBU: ", IBU, sep=""))
p.usa = ggplot() + geom_polygon(data=states, aes(x=long, y=lat, group=group)) + geom_point(data=usabeer, aes(x=lon, y=lat, size=IBU, color=IBU, alpha=IBU, text=mytext)) + scale_size_continuous(range=c(1,7)) + scale_color_viridis(option='inferno', direction = -1) + theme_void() + coord_map() + borders('state', fill='whitesmoke', alpha=.3) + labs(title='Distribution of International Bitterness Unit by Cities') + theme(legend.position = 'none', plot.title = element_text(size=25, hjust=.5))
p.usa = ggplotly(p.usa, tooltip='text')
p.usa
Number of breweries per city in descending order
nbrew = final %>% select(IBU, City, State) %>% group_by(City, State) %>% tally(sort=T)
nbrew
MAX IBU for each city
maxibu = final %>% select(IBU, City, State) %>% group_by(State, City) %>% slice(which.max(IBU))
maxibu = arrange(maxibu, -IBU)
maxibu
merge, conduct cor test
ibu.brew.cor = as.data.frame(cbind(maxibu$IBU, nbrew$n))
ibu.brew.cor = rename(ibu.brew.cor, max_ibu=V1, n_brew = V2)
cor(ibu.brew.cor) # .7939
## max_ibu n_brew
## max_ibu 1.0000000 0.7938968
## n_brew 0.7938968 1.0000000
cor.test(x=ibu.brew.cor$n_brew, y=ibu.brew.cor$max_ibu, method = 'pearson')
##
## Pearson's product-moment correlation
##
## data: ibu.brew.cor$n_brew and ibu.brew.cor$max_ibu
## t = 22.196, df = 289, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7471146 0.8328525
## sample estimates:
## cor
## 0.7938968
Boston, MA IBU statistics. Boston is ranked 12th in breweries per city yet its maximum IBU is only 45. That’s 50% less than the max IBU compared to cities with similar demographics. In fact, if you take the max IBU of all the cities in MA and extract the median, Boston is still 30 IBU lower. I believe that Boston is an untapped market with a need for higher IBU beer production.
ma = maxibu %>% select(State, IBU) %>% filter(State == 'MA') %>% summarize(value = IBU)
## Adding missing grouping variables: `City`
## `summarise()` has grouped output by 'State'. You can override using the `.groups` argument.
mean(ma$value)
## [1] 68.09091
fivenum(ma$value)
## [1] 35.0 45.0 69.0 82.5 130.0